qiime tools import \
--type NCBIAccessionIDs \
--input-path Protein.tsv \
--output-path Protein.qza
qiime fondue get-all \
--i-accession-ids Protein.qza \
--output-dir Meta_Analysis
#You will also need a metadata file that contains all the other data related to the sample from which the sequences are obtained
Manifest files: Protein_manifest
Metadata file: FINAL_Metadata.tsv
qiime tools import \
--type 'SampleData[SequencesWithQuality]' \
--input-path Protein_manifest \
--output-path Protein_demux_NEW.qza \
--input-format SingleEndFastqManifestPhred33
(Single reads)
qiime demux summarize \
--i-data Protein_demux_NEW.qza \
--o-visualization Protein_demux_NEW.qzv
qiime tools view Protein_demux_NEW.qzv
qiime dada2 denoise-single \
--i-demultiplexed-seqs Protein_demux_NEW.qza \
--p-trim-left 0 \
--p-trunc-len 150 \
--o-representative-sequences Protein_rep-seqs.qza \
--o-table Protein_table.qza \
--o-denoising-stats Protein_stats.qza
qiime feature-table summarize \
--i-table Protein_table.qza \
--o-visualization Protein_table.qzv \
--m-sample-metadata-file FINAL_Metadata.tsv
qiime feature-table tabulate-seqs \
--i-data Protein_rep-seqs.qza \
--o-visualization Protein_rep-seqs.qzv
qiime phylogeny align-to-tree-mafft-fasttree \
--i-sequences PROTEIN_NEW_rep-seqs.qza \
--o-alignment Protein_NEW_aligned-rep-seqs.qza \
--o-masked-alignment Protein_NEW_masked-aligned-rep-seqs.qza \
--o-tree Protein_NEW_unrooted-tree.qza \
--o-rooted-tree Protein_NEW_rooted-tree.qza
#This is necessary to begin to explore the taxonomic composition of the samples, and again relate that to sample metadata #Train classifier or download Silva or Greengene trained classifier
qiime feature-classifier classify-sklearn \
--i-classifier silva-138.1-ssu-nr99-classifier.qza \
--i-reads Protein_rep-seqs.qza \
--o-classification Protein_taxonomy.qza
knitr::opts_chunk$set(
message = FALSE,
warning = FALSE,
echo = TRUE
)
library(conflicted)
# Set conflict preferences
suppressMessages({
conflicts_prefer(microbiome::diversity)
conflicts_prefer(qiime2R::read_qza)
conflicts_prefer(microbiomeMarker::tax_table)
conflicts_prefer(microbiomeMarker::nsamples)
conflicts_prefer(stats::dist)
conflicts_prefer(microbiomeMarker::ntaxa)
conflicts_prefer(phyloseq::plot_heatmap)
conflicts_prefer(base::as.matrix)
conflicts_prefer(phyloseq::ntaxa)
conflicts_prefer(base::setdiff)
conflicts_prefer(plotly::filter)
conflicts_prefer(ggplot2::margin)
conflicts_prefer(base::rownames)
conflicts_prefer(base::colnames)
conflicts_prefer(ggplot2::annotate)
conflicts_prefer(microbiome::transform)
})
# List of required libraries
libraries <- c("NBZIMM", "broom", "caret", "cluster", "codyn",
"cowplot", "devtools", "dplyr",
"edgeR", "fpc", "ggplot2", "ggplotify", "ggpmisc",
"ggpubr", "ggRandomForests", "ggrepel", "ggsci",
"grid", "gridExtra", "igraph", "LDAvis",
"limma", "magrittr", "MASS", "metamicrobiomeR",
"microbiome", "patchwork", "phyloseq", "proxy", "randomForest",
"randomForestSRC", "RColorBrewer","scales", "slam",
"splitstackshape", "stm", "tensorflow", "tibble",
"tidyverse", "topicmodels", "usethis", "vegan", "lme4",
"rbiom", "qiime2R", "ggVennDiagram", "ggvenn",
"lattice", "permute", "SpiecEasi", "conflicted",
"biomformat", "mixOmics", "kableExtra", "nlme",
"glmmTMB", "gghighlight", "R.utils",
"microbiomeutilities", "microbiomeMarker", "microViz", "ggpubr", "plotly",
"pander", "pROC", "ggdendro", "grid", "ggpubr", "ggpattern")
# Load libraries silently
invisible(lapply(libraries, function(pkg) {
suppressWarnings(suppressPackageStartupMessages(library(pkg, character.only = TRUE)))
}))
Protein <- qza_to_phyloseq(
features = "TWO_table.qza", # Feature table file
tree = "Protein_rooted-tree.qza", # Phylogenetic tree file
taxonomy = "PROTEIN_NEW_taxonomy.qza", # Taxonomy file
metadata = "FINAL_Metadata.tsv" # Metadata file
)
# Filter taxa with counts greater than 1 across all samples
Protein_filt <- prune_taxa(taxa_sums(Protein) > 1, Protein)
# Output a summary of the filtered phyloseq object
Protein_filt
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 10123 taxa and 187 samples ]
## sample_data() Sample Data: [ 187 samples by 26 sample variables ]
## tax_table() Taxonomy Table: [ 10123 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 10123 tips and 10061 internal nodes ]
Protein_filt@tax_table %>%
data.frame() %>%
group_by(Phylum) %>%
summarise(n = length(Genus))
## # A tibble: 36 × 2
## Phylum n
## <chr> <int>
## 1 Acidobacteriota 183
## 2 Actinobacteriota 757
## 3 Armatimonadota 14
## 4 Bacteroidota 2982
## 5 Bdellovibrionota 21
## 6 Caldisericota 3
## 7 Calditrichota 2
## 8 Campylobacterota 44
## 9 Chloroflexi 236
## 10 Cloacimonadota 2
## # ℹ 26 more rows
bac_Protein_filt <- subset_taxa(Protein_filt, Kingdom == "d__Bacteria")
bac_Protein_df <- psmelt(bac_Protein_filt)
#Now we can more easily summarise our metagenomes by sample using standard synta##
number_of_taxa <- bac_Protein_df %>%
dplyr::filter(Abundance > 5) %>%
group_by(Protein_source == "Plant", Genus) %>% #Protein_type can be changed to any group
summarise(n = length(Abundance))
number_of_taxa2 <- bac_Protein_df %>%
dplyr::filter(Abundance > 5) %>%
group_by(Protein_source == "Animal", Genus) %>% #Protein_type can be changed to any group
summarise(n = length(Abundance))
#To place the sets of genera in a list, we use
venn_data <- list(Plant = c(number_of_taxa$Genus[number_of_taxa$`Protein_source == "Plant"`]),
Animal = c(number_of_taxa2$Genus[number_of_taxa2$`Protein_source == "Animal"`]))
# Plot the Venn diagram and specify colors for each set
ggvenn(
venn_data,
fill_color = c("blue", "red"),
stroke_size = 0.5, set_name_size = 4
)
# Step 1: Extract unique taxa
plant_taxa <- number_of_taxa$Genus[number_of_taxa$`Protein_source == "Plant"`]
animal_taxa <- number_of_taxa2$Genus[number_of_taxa2$`Protein_source == "Animal"`]
unique_plant_taxa <- setdiff(plant_taxa, animal_taxa)
unique_animal_taxa <- setdiff(animal_taxa, plant_taxa)
# Preview unique taxa
#head(unique_plant_taxa)
#head(unique_animal_taxa)
# Step 2: Extract abundance data for unique taxa
unique_plant_data <- number_of_taxa[number_of_taxa$Genus %in% unique_plant_taxa, ]
unique_animal_data <- number_of_taxa2[number_of_taxa2$Genus %in% unique_animal_taxa, ]
# Optional: Preview the data frames
#head(unique_plant_data)
#head(unique_animal_data)
# Step 3: Summarize abundance for unique plant taxa
unique_plant_abundance <- unique_plant_data %>%
group_by(Genus) %>%
summarize(TotalAbundance = sum(n, na.rm = TRUE)) %>%
arrange(desc(TotalAbundance))
# Step 4: Identify the top 20 unique plant taxa
top_20_unique_taxa <- unique_plant_abundance$Genus[1:min(20, nrow(unique_plant_abundance))]
# Step 5: Group remaining taxa as "Others"
pie_data_combined <- unique_plant_data %>%
mutate(Taxa_Genus = ifelse(Genus %in% top_20_unique_taxa, Genus, "Others")) %>%
group_by(Taxa_Genus) %>%
summarize(TotalAbundance = sum(n, na.rm = TRUE)) %>%
mutate(Percentage = TotalAbundance / sum(TotalAbundance) * 100) %>%
arrange(desc(TotalAbundance))
#pie_data_combined
# Step 6: Create a distinct color palette
if (nrow(pie_data_combined) > 0) {
distinct_colors <- brewer.pal(n = 11, name = "Paired") # Primary palette
distinct_colors <- c(distinct_colors, brewer.pal(n = 7, name = "Dark2"), brewer.pal(n = 3, name = "Set3")) # Additional colors
distinct_colors <- distinct_colors[1:nrow(pie_data_combined)] # Ensure enough colors for the data
} else {
stop("Error: `pie_data_combined` is empty. Check your data processing steps.")
}
# Step 7: Add percentages to legend labels
if ("Taxa_Genus" %in% colnames(pie_data_combined) && "Percentage" %in% colnames(pie_data_combined)) {
pie_data_combined <- pie_data_combined %>%
mutate(Taxa_Genus = paste0(Taxa_Genus, " (", round(Percentage, 1), "%)"))
} else {
stop("Error: `pie_data_combined` does not have the required columns. Ensure `Taxa_Genus` and `Percentage` exist.")
}
# Step 8: Plot the pie chart
if (nrow(pie_data_combined) > 0) {
pie_chart_combined <- ggplot(pie_data_combined, aes(x = "", y = TotalAbundance, fill = Taxa_Genus)) +
geom_bar(stat = "identity", width = 1) +
coord_polar(theta = "y") +
labs(
title = "Unique Plant Taxa",
fill = "Genus"
) +
theme_void() +
theme(
legend.position = "bottom",
legend.key.size = unit(0.3, "cm"),
legend.text = element_text(size = 7),
plot.title = element_text(face = "bold", hjust = 0.5, size = 14),
plot.margin = margin(t = 10, r = 10, b = 30, l = 10)
) +
scale_fill_manual(values = distinct_colors)
# Step 9: Display the pie chart
print(pie_chart_combined)
}
# Step 1: Filter unique animal taxa
unique_animal_group <- unique_animal_data %>%
filter(Genus %in% unique_animal_taxa)
# Step 2: Summarize abundance for unique animal taxa
unique_animal_abundance <- unique_animal_group %>%
group_by(Genus) %>%
summarize(TotalAbundance = sum(n, na.rm = TRUE)) %>%
arrange(desc(TotalAbundance))
# Step 3: Identify the top 20 unique animal taxa
top_20_unique_animal_taxa <- unique_animal_abundance$Genus[1:min(20, nrow(unique_animal_abundance))]
# Step 4: Group remaining taxa as "Others"
pie_data_combined <- unique_animal_group %>%
mutate(Taxa_Genus = ifelse(Genus %in% top_20_unique_animal_taxa, Genus, "Others")) %>%
group_by(Taxa_Genus) %>%
summarize(TotalAbundance = sum(n, na.rm = TRUE)) %>%
mutate(Percentage = TotalAbundance / sum(TotalAbundance) * 100) %>%
arrange(desc(TotalAbundance))
# Step 5: Add percentages to legend labels
pie_data_combined <- pie_data_combined %>%
mutate(
Taxa_Genus = paste0(Taxa_Genus, " (", round(Percentage, 1), "%)") # Add percentage in brackets
)
# Step 6: Use the same distinct color palette as before
distinct_colors <- brewer.pal(n = 11, name = "Paired") # Primary palette
distinct_colors <- c(distinct_colors, brewer.pal(n = 7, name = "Dark2"), brewer.pal(n = 3, name = "Set3")) # Additional colors
distinct_colors <- distinct_colors[1:nrow(pie_data_combined)] # Ensure enough colors for the data
# Step 7: Plot the pie chart
pie_chart_combined <- ggplot(pie_data_combined, aes(x = "", y = TotalAbundance, fill = Taxa_Genus)) +
geom_bar(stat = "identity", width = 1) +
coord_polar(theta = "y") +
labs(
title = "Unique Animal Taxa",
fill = "Genus"
) +
theme_void() +
theme(
legend.position = "bottom",
legend.key.size = unit(0.3, "cm"), # Adjust legend size
legend.text = element_text(size = 7), # Reduce legend text size
plot.title = element_text(face = "bold", hjust = 0.5, size = 14), # Center and enlarge title
plot.margin = margin(t = 10, r = 10, b = 30, l = 10) # Add bottom margin
) +
scale_fill_manual(values = distinct_colors) # Apply distinct colors
# Step 8: Display the pie chart
print(pie_chart_combined)
# Step 1: Filter and clean the phyloseq object
Protein_filt <- Protein_filt %>%
tax_fix(unknowns = c("uncultured", "unclassified", "unknown", "NA")) %>% # Replace unknown values
tax_filter(min_prevalence = 1) %>% # Retain taxa with at least 1 prevalence
phyloseq_validate() # Validate the phyloseq object
#print(Protein_filt)
# Step 2: Check and handle NAs in the Genus rank
if (any(is.na(tax_table(Protein_filt)[, "Genus"]))) {
cat("NAs found in Genus rank. Replacing with 'Unknown'.\n")
tax_table(Protein_filt)[, "Genus"] <- ifelse(
is.na(tax_table(Protein_filt)[, "Genus"]),
"Unknown",
tax_table(Protein_filt)[, "Genus"]
)
}
# Step 3: Create the bar plot
p1 <- Protein_filt %>%
ps_select(Protein_source, Protein_level) %>% # Select relevant metadata columns
phyloseq::merge_samples(group = "Protein_source") %>% # Merge samples by "Protein_source"
comp_barplot(
tax_level = "Genus", # Aggregation level for plotting
n_taxa = 19, # Number of taxa to display
bar_width = 0.8, # Adjust bar width
position = position_dodge(width = 0.6), # Adjust spacing between bars
transform = function(x) x * 100 # Convert relative abundance to percentages
) +
labs(
x = "Protein Source", # X-axis label
y = "Relative Abundance (%)" # Y-axis label indicating percentage
) +
theme_classic() + # Apply a clean theme
theme(
axis.text.x = element_text(face = "bold", size = 12), # Customize x-axis text
axis.text.y = element_text(face = "bold", size = 12), # Customize y-axis text
axis.title.x = element_text(face = "bold", size = 14), # Bold x-axis title
axis.title.y = element_text(face = "bold", size = 14), # Bold y-axis title
legend.text = element_text(size = 10), # Adjust legend text size
legend.title = element_text(face = "bold") # Bold legend title
)
# Step 4: Ensure proper ordering of Protein_source levels
if ("Protein_source" %in% colnames(p1$data)) {
p1$data$Protein_source <- factor(p1$data$Protein_source, levels = c("Animal", "Plant"))
}
# Step 5: Display the plot
print(p1)
# Required Libraries
library(phyloseq)
library(ggplot2)
library(dplyr)
# Step 1: Extract Sequencing Depth
sample_depth <- data.frame(
SampleID = sample_names(Protein_filt), # Sample names
SequencingDepth = sample_sums(Protein_filt) # Total reads per sample
)
# Step 2: Plot Sequencing Depth
sequencing_depth_plot <- ggplot(sample_depth, aes(x = reorder(SampleID, -SequencingDepth), y = SequencingDepth)) +
geom_bar(stat = "identity", fill = "steelblue") + # Bar plot
theme_classic() + # Clean theme
labs(
x = "Samples",
y = "Sequencing Depth (Total Reads)",
title = "Sequencing Depth Across Samples"
) +
theme(
axis.text.x = element_blank(), # Remove x-axis text
axis.ticks.x = element_blank(), # Remove x-axis ticks
axis.text.y = element_text(face = "bold"), # Bold y-axis text
axis.title = element_text(face = "bold", size = 12), # Bold axis titles
plot.title = element_text(face = "bold", size = 14, hjust = 0.5) # Centered title
) +
geom_hline(
yintercept = median(sample_depth$SequencingDepth),
linetype = "dashed", color = "red", size = 1
) + # Median reference line
annotate(
"text",
x = 1,
y = median(sample_depth$SequencingDepth) + 500,
label = paste("Median =", median(sample_depth$SequencingDepth)),
color = "red", size = 4, hjust = 0
)
# Display Sequencing Depth Plot
print(sequencing_depth_plot)
# Step 3: Calculate Alpha Diversity
alpha_diversity <- estimate_richness(Protein_filt, measures = c("Shannon", "Observed")) %>%
mutate(
SequencingDepth = sample_sums(Protein_filt), # Add sequencing depth
SampleID = rownames(.) # Add sample IDs
)
# Step 4: Correlation Between Sequencing Depth and Shannon Index
cor_result <- cor.test(alpha_diversity$SequencingDepth, alpha_diversity$Shannon, method = "spearman")
cor_rho <- round(cor_result$estimate, 3) # Spearman rho
cor_pval <- signif(cor_result$p.value, 3) # p-value
# Step 5: Plot Sequencing Depth vs Alpha Diversity
shannon_plot <- ggplot(alpha_diversity, aes(x = SequencingDepth, y = Shannon)) +
geom_point(size = 3, color = "steelblue") + # Scatter plot
geom_smooth(method = "lm", color = "red", linetype = "dashed") + # Add linear trend line
labs(
title = "Relationship Between Sequencing Depth and Microbial Diversity",
x = "Sequencing Depth (Total Reads)",
y = "Shannon Diversity Index"
) +
theme_classic() + # Clean theme
theme(
plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
axis.title = element_text(face = "bold", size = 12),
axis.text = element_text(size = 10)
) +
annotate(
"text",
x = max(alpha_diversity$SequencingDepth) * 0.7,
y = max(alpha_diversity$Shannon) * 0.95,
label = paste0("Spearman's rho = ", cor_rho, "\nP-value = ", cor_pval),
color = "red", size = 5, hjust = 0
)
# Display Shannon Plot
print(shannon_plot)
# Step 1: Calculate Alpha Diversity (e.g., Shannon Index)
alpha_div <- estimate_richness(Protein_filt, measures = "Shannon")
# Step 2: Combine Alpha Diversity with Metadata
metadata <- data.frame(sample_data(Protein_filt))
alpha_div_df <- cbind(alpha_div, metadata)
# Step 3: Convert Categorical Variables to Numeric
numeric_vars <- c("Protein_source", "Country", "Isolation_source", "Protein_level", "Age_in_weeks")
alpha_div_df[numeric_vars] <- lapply(alpha_div_df[numeric_vars], function(x) as.numeric(factor(x)))
# Step 4: Compute Correlation Matrix for Predictors
cor_matrix <- cor(alpha_div_df[, numeric_vars], use = "complete.obs")
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
# Step 5: Fit Linear Mixed-Effects Models
# Model 1: Protein Source as fixed effect, Country as random effect
model1 <- lmer(Shannon ~ Protein_source + (1 | Isolation_source), data = alpha_div_df)
# Model 2: Add Protein Level and Isolation Source as fixed effects
model2 <- lmer(Shannon ~ Protein_source + Protein_level + Country + (1 | Isolation_source), data = alpha_div_df)
# Model 3: Add Age in Weeks as a fixed effect
model3 <- lmer(Shannon ~ Protein_source + Protein_level + Age_in_weeks + Country + (1 | Isolation_source), data = alpha_div_df)
# Model 4: Protein Source and Protein Level as fixed effects
model4 <- lmer(Shannon ~ Protein_source + Protein_level + (1 | Isolation_source), data = alpha_div_df)
# Model 5: Add Age in Weeks and Country as fixed effects
model5 <- lmer(Shannon ~ Protein_source + Protein_level + Age_in_weeks + Country + (1 | Isolation_source), data = alpha_div_df)
# Model 6: Protein Source, Protein Level, and Age in Weeks as fixed effects
model6 <- lmer(Shannon ~ Protein_level + Protein_source + Age_in_weeks + Country + (1 | Isolation_source), data = alpha_div_df)
# Step 6: Summarize Models
cat("Model Summaries:\n")
## Model Summaries:
# Step 7: Model Comparison
cat("Likelihood Ratio Test for All Models:\n")
## Likelihood Ratio Test for All Models:
print(anova(model1, model2, model3, model4, model5, model6))
## Data: alpha_div_df
## Models:
## model1: Shannon ~ Protein_source + (1 | Isolation_source)
## model4: Shannon ~ Protein_source + Protein_level + (1 | Isolation_source)
## model2: Shannon ~ Protein_source + Protein_level + Country + (1 | Isolation_source)
## model3: Shannon ~ Protein_source + Protein_level + Age_in_weeks + Country + (1 | Isolation_source)
## model5: Shannon ~ Protein_source + Protein_level + Age_in_weeks + Country + (1 | Isolation_source)
## model6: Shannon ~ Protein_level + Protein_source + Age_in_weeks + Country + (1 | Isolation_source)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## model1 4 484.57 497.49 -238.28 476.57
## model4 5 485.61 501.77 -237.81 475.61 0.9585 1 0.3276
## model2 6 469.24 488.63 -228.62 457.24 18.3676 1 1.821e-05 ***
## model3 7 470.82 493.43 -228.41 456.82 0.4266 1 0.5137
## model5 7 470.82 493.43 -228.41 456.82 0.0000 0
## model6 7 470.82 493.43 -228.41 456.82 0.0000 0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
alpha_div <- data.frame(
Observed = rowSums(otu_table_data > 0), # Count of observed taxa
Shannon = diversity(otu_table_data, index = "shannon"), # Shannon index
Simpson = diversity(otu_table_data, index = "simpson"), # Simpson index
ACE = apply(otu_table_data, 1, function(x) sum(x > 0) / mean(x[x > 0])), # ACE approximation
Chao1 = apply(otu_table_data, 1, function(x) { # Chao1 index
S_obs <- sum(x > 0)
singletons <- sum(x == 1)
doubletons <- sum(x == 2)
if (doubletons > 0) S_obs + (singletons^2) / (2 * doubletons) else NA
})
)
# Step 4: Combine Alpha Diversity Metrics with Metadata
metadata <- data.frame(sample_data(Protein_filt))
if (!"Protein_source" %in% colnames(metadata)) stop("Protein_source column missing in metadata.")
# Match alpha diversity metrics to metadata samples
alpha_div <- alpha_div[rownames(metadata), , drop = FALSE]
alpha_div <- cbind(metadata, alpha_div)
# Step 5: Prepare for ROC Curve Analysis
if (length(unique(alpha_div$Protein_source)) != 2) {
stop("Protein_source must have exactly two groups for ROC analysis.")
}
# Define metrics and colors
metrics <- c("Observed", "Shannon", "Simpson", "ACE", "Chao1")
colors <- c("red", "blue", "green", "purple", "orange")
legend_labels <- c()
# Step 6: Plot Multi-Curve ROC with AUC and P-Values
plot(NULL, xlim = c(0, 1), ylim = c(0, 1),
xlab = "False Positive Rate", ylab = "True Positive Rate",
main = "ROC Curves for Alpha Diversity Metrics (Protein Source)", type = "n")
lines(c(0, 1), c(1, 0), col = "gray", lty = 2) # Diagonal reference line
for (i in seq_along(metrics)) {
metric <- metrics[i]
if (any(is.na(alpha_div[[metric]]) | is.infinite(alpha_div[[metric]]))) {
warning(paste("Skipping", metric, "due to invalid values"))
next
}
# ROC and AUC calculation
roc_result <- roc(alpha_div$Protein_source, alpha_div[[metric]])
auc_value <- auc(roc_result)
# Wilcoxon test for significance
p_value <- wilcox.test(alpha_div[[metric]] ~ alpha_div$Protein_source)$p.value
# Add curve and update legend
lines(roc_result, col = colors[i], lwd = 2)
legend_labels <- c(legend_labels, paste(metric, "(AUC =", round(auc_value, 3), ", p =", signif(p_value, 3), ")"))
}
# Add legend to the plot
legend("bottomleft", legend = legend_labels, col = colors, lwd = 2, title = "Alpha Diversity Metrics")
cat("ROC curve successfully plotted.\n")
# Check sample data and group counts
table(sample_data(Protein_filt)$Protein_source) # Display counts for each group in Protein_source
##
## Animal Plant
## 155 32
# Plot alpha diversity metrics
alpha_diversity_plot <- Protein_filt %>%
plot_richness(
x = "Protein_source", # Group by Protein_source
measures = c("Shannon", "Chao1", "InvSimpson") # Select alpha diversity measures
) +
geom_boxplot(
aes(fill = Protein_source),
outlier.shape = 16, # Shape of outlier points
outlier.size = 0.1, # Size of outlier points
outlier.alpha = 0.9, # Transparency of outliers
show.legend = FALSE # Hide legend for boxplots
) +
labs(
x = "Protein Source",
y = "Alpha Diversity Index",
title = "Alpha Diversity Across Protein Sources"
) +
stat_compare_means(size = 4) + # Add p-values for comparisons
theme_classic() + # Use a clean classic theme
scale_fill_manual(values = c("Plant" = "blue", "Animal" = "red")) + # Custom fill colors
theme(
strip.text = element_text(face = "bold", size = 12), # Bold facet labels
axis.title.x = element_text(face = "bold", size = 12), # Bold x-axis title
axis.title.y = element_text(face = "bold", size = 12), # Bold y-axis title
plot.title = element_text(face = "bold", size = 14, hjust = 0.5) # Centered bold title
)
# Display the plot
alpha_diversity_plot
# Step 1: CLR Transformation
Protein_filt_clr <- transform(Protein_filt1, "clr") # Centered log-ratio transformation
# Step 2: Calculate Aitchison Distance
otu_clr <- otu_table(Protein_filt_clr) # Extract transformed OTU table
aitchison_dist <- dist(t(otu_clr), method = "euclidean") # Aitchison distance
# Step 3: PERMANOVA Analysis
# Extract metadata as a data frame
metadata <- as(sample_data(Protein_filt1), "data.frame")
# Run PERMANOVA
permanova_res <- adonis2(aitchison_dist ~ Protein_source, data = metadata, permutations = 999)
# View the PERMANOVA results
print(permanova_res)
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = aitchison_dist ~ Protein_source, data = metadata, permutations = 999)
## Df SumOfSqs R2 F Pr(>F)
## Model 1 47072 0.05496 10.758 0.001 ***
## Residual 185 809474 0.94504
## Total 186 856545 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Extract R² and p-value
R2 <- round(permanova_res$R2[1], 3) # Proportion of variance explained
p_value <- signif(permanova_res$`Pr(>F)`[1], 3) # p-value
# Step 4: PCoA Ordination
ProteinPCA_aitchison <- cmdscale(aitchison_dist, k = 2, eig = TRUE) # Perform classical MDS
PCoA_scores <- as.data.frame(ProteinPCA_aitchison$points) # Extract PCoA scores
colnames(PCoA_scores) <- c("PCoA1", "PCoA2")
PCoA_scores$Protein_source <- metadata$Protein_source # Add metadata for grouping
# Calculate explained variance for PCoA axes
eigenvalues <- ProteinPCA_aitchison$eig
positive_eigenvalues <- eigenvalues[eigenvalues > 0]
variance_explained <- positive_eigenvalues / sum(positive_eigenvalues) * 100
# Step 5: Plot PCoA with Annotated R² and p-value
Proteingraph_aitchison <- ggplot(PCoA_scores, aes(x = PCoA1, y = PCoA2, color = Protein_source)) +
geom_point(size = 3) + # Plot points
stat_ellipse(aes(fill = Protein_source), alpha = 0.2, geom = "polygon") + # Add confidence ellipses
labs(
title = "PCoA Ordination of Aitchison Distance",
x = paste0("PCoA1 (", round(variance_explained[1], 2), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2], 2), "%)"),
color = "Protein Source",
fill = "Protein Source"
) +
scale_color_manual(values = c("Plant" = "blue", "Animal" = "red")) + # Custom colors
scale_fill_manual(values = c("Plant" = "blue", "Animal" = "red")) + # Custom colors
annotate(
"text",
x = max(PCoA_scores$PCoA1) * 0.9,
y = min(PCoA_scores$PCoA2) * 0.9,
label = paste0("PERMANOVA R² = ", R2, "\n", "p-value = ", p_value),
size = 4, hjust = 0.4, color = "black"
) +
theme_classic2() +
theme(
legend.title = element_text(face = "bold"),
axis.title = element_text(face = "bold", size = 12),
axis.text = element_text(size = 10)
)
# Display the plot
Proteingraph_aitchison
# CLR Transformation and Aitchison Distance (from previous steps)
Protein_filt_clr <- transform(Protein_filt1, "clr") # CLR transformation
aitchison_dist <- dist(t(otu_table(Protein_filt_clr)), method = "euclidean") # Aitchison distance
# Extract metadata
metadata <- as(sample_data(Protein_filt1), "data.frame")
grouping_var <- metadata$Protein_source # Plant/Animal grouping
# Step 1: Run ANOSIM
anosim_res <- anosim(aitchison_dist, grouping_var, permutations = 999)
R_value <- round(anosim_res$statistic, 3)
p_value <- signif(anosim_res$signif, 3)
# Step 2: Prepare Dissimilarity Data Frame
dissimilarity_df <- as.data.frame(as.table(as.matrix(aitchison_dist)))
colnames(dissimilarity_df) <- c("Sample1", "Sample2", "Distance")
# Add grouping information
dissimilarity_df$Group1 <- metadata$Protein_source[match(dissimilarity_df$Sample1, rownames(metadata))]
dissimilarity_df$Group2 <- metadata$Protein_source[match(dissimilarity_df$Sample2, rownames(metadata))]
# Define comparison types
dissimilarity_df$Type <- ifelse(
dissimilarity_df$Group1 == dissimilarity_df$Group2,
paste0("Within-", dissimilarity_df$Group1),
"Between-Groups"
)
# Remove duplicates and self-comparisons
dissimilarity_df <- dissimilarity_df[dissimilarity_df$Sample1 != dissimilarity_df$Sample2, ]
# Step 3: Normalize Distances
dissimilarity_df$Normalized_Distance <- scales::rescale(dissimilarity_df$Distance, to = c(0, 1))
# Step 4: Calculate Medians
medians <- aggregate(Normalized_Distance ~ Type, data = dissimilarity_df, FUN = median)
# Step 5: Create ANOSIM Boxplot
anosim_plot <- ggplot(dissimilarity_df, aes(x = Type, y = Normalized_Distance, fill = Type)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) + # Boxplot without outlier dots
stat_summary(fun = "median", geom = "point", size = 4, color = "black", shape = 18) + # Median points
stat_summary(fun = "median", geom = "text", aes(label = round(..y.., 3)), vjust = -1.0, size = 3.5) + # Median labels
scale_y_continuous(limits = c(0, 1)) + # Normalize distance axis
annotate("text", x = 2, y = 0.95, label = paste0("ANOSIM: R = ", R_value, ", p = ", p_value), size = 4, fontface = "bold") + # ANOSIM results
labs(
x = "Group Comparison",
y = "Aitchison Dissimilarity Distance"
) +
scale_fill_manual(values = c("Within-Plant" = "blue", "Within-Animal" = "red", "Between-Groups" = "purple")) + # Custom colors
theme_classic() +
theme(
axis.title.x = element_text(face = "bold", size = 12),
axis.text.x = element_text(size = 10),
axis.title.y = element_text(face = "bold", size = 12),
legend.text = element_text(size = 10),
legend.title = element_text(face = "bold", size = 12),
axis.text.y = element_text(size = 10)
)
# Display the Plot
anosim_plot
set.seed(28)
predictprotein <- t(otu_table(Protein_filt))
typeres <- sample_data(Protein_filt)$Protein_source
rfprot <- data.frame(typeres, predictprotein)
# Split data
trainIndex <- createDataPartition(rfprot$typeres, p = 0.7, list = FALSE)
trainprot <- rfprot[trainIndex, ] # Subset the data for training
testprot <- rfprot[-trainIndex, ] # Subset the data for testing
cat("Training Set Size:", nrow(trainprot), "\n")
## Training Set Size: 132
cat("Test Set Size:", nrow(testprot), "\n")
## Test Set Size: 55
# Fit the Random Forest model using the training set
typeclassval <- rfsrc(typeres ~ ., data = trainprot, ntree = 1000, nodesize = 1,
save.memory = TRUE, forest = TRUE)
typeclassval
## Sample size: 132
## Frequency of class labels: 109, 23
## Number of trees: 1000
## Forest terminal node size: 1
## Average no. of terminal nodes: 6.661
## No. of variables tried at each split: 101
## Total no. of variables: 10123
## Resampling used to grow trees: swor
## Resample size used to grow trees: 83
## Analysis: RF-C
## Family: class
## Splitting rule: gini *random*
## Number of random split points: 10
## Imbalanced ratio: 4.7391
## (OOB) Brier score: 0.02295572
## (OOB) Normalized Brier score: 0.09182289
## (OOB) AUC: 0.99680893
## (OOB) Log-loss: 0.0752792
## (OOB) PR-AUC: 0.98661547
## (OOB) G-mean: 0.98147988
## (OOB) Requested performance error: 0.03030303, 0.03669725, 0
##
## Confusion matrix:
##
## predicted
## observed Animal Plant class.error
## Animal 105 4 0.0367
## Plant 0 23 0.0000
##
## (OOB) Misclassification rate: 0.03030303
# Make predictions on the test set
predictions <- predict(typeclassval, newdata = testprot)$class
# Extract actual classes from the test set
true_labels <- testprot$typeres
# Create confusion matrix manually using table()
conf_table <- table(Predicted = predictions, Actual = true_labels)
conf_table
## Actual
## Predicted Animal Plant
## Animal 45 1
## Plant 1 8
# Create confusion matrix
conf_matrix <- table(Predicted = predictions, Actual = true_labels)
# Calculate accuracy
accuracy_random <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Accuracy on test data:", round(accuracy_random * 100, 2), "%\n")
## Accuracy on test data: 96.36 %
# Convert confusion table to a data frame for ggplot visualization
conf_df <- as.data.frame(conf_table)
# Calculate total number of predictions for accuracy calculation
total_predictions <- sum(conf_table)
total_predictions
## [1] 55
# Calculate the class-specific accuracy for each cell
conf_df$Accuracy <- (conf_df$Freq / total_predictions) * 100
# Create a label for each cell that contains both the count and percentage
conf_df$Label <- paste0(conf_df$Freq, " (", round(conf_df$Accuracy, 1), "%)")
# Plot the confusion matrix using ggplot2
confusion_plot <- ggplot(data = conf_df, aes(x = Predicted, y = Actual, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Label), color = "black") + # Add the text label with count and accuracy
scale_fill_gradient(low = "white", high = "blue") +
labs(title = "Test data Confusion Matrix",
x = "Predicted",
y = "Actual",
fill = "Frequency") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 14, face = "bold"), # Reduce title label size
axis.title.x = element_text(size = 12, face = "bold"), # Increase x-axis label size
axis.title.y = element_text(size = 12, face = "bold") # Increase y-axis label size
)
confusion_plot
# Make Training Dataset
predict <- t(otu_table(Protein_filt))
dim(predict)
## [1] 187 10123
# Create response variable
# Create response variable for samples with class as "Protein_source"
res <- sample_data(Protein_filt)$Protein_source
# Combine them into 1 data frame
Protein_rf <- data.frame(res, predict)
dim(Protein_rf) #855 samples with 711 OTUs
## [1] 187 10124
set.seed(2000)
proteinclass <- randomForest(res ~ ., data=Protein_rf, ntree = 1000, importance = TRUE)
print(proteinclass)
##
## Call:
## randomForest(formula = res ~ ., data = Protein_rf, ntree = 1000, importance = TRUE)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 100
##
## OOB estimate of error rate: 2.14%
## Confusion matrix:
## Animal Plant class.error
## Animal 154 1 0.006451613
## Plant 3 29 0.093750000
# Extract importance scores
impcl <- randomForest::importance(proteinclass)
impcl_df <- data.frame(OTUID = rownames(impcl), impcl)
# Remove 'X' prefix from 'OTUID'y
impcl_df$OTUID <- gsub("X", "", impcl_df$OTUID)
# Prepare the OTU dataframe with taxonomic information
otu_df <- as.data.frame(tax_table(Protein_filt))
otu_df$OTUID <- rownames(otu_df)
# Assuming impcl_df is your data frame with taxa and their MDA values
positive_impact_taxa <- impcl_df %>%
filter(MeanDecreaseAccuracy > 0) %>%
arrange(desc(MeanDecreaseAccuracy)) %>%
top_n(10, MeanDecreaseAccuracy)
# Filter out taxa with negative MDA values
positive_impact_taxa <- impcl_df[impcl_df$MeanDecreaseAccuracy > 0, ]
# Order by MDA in descending order and select the top 10
positive_impact_taxa <- positive_impact_taxa[order(-positive_impact_taxa$MeanDecreaseAccuracy), ]
top_10_positive_taxa <- head(positive_impact_taxa, 10)
# Merge importance scores with taxonomic information
imp_merged <- merge(top_10_positive_taxa, otu_df, by = "OTUID")
imp_merged <- imp_merged %>% tidyr::replace_na(list(Kingdom = "Unassigned",
Phylum = "Unassigned",
Class = "Unassigned",
Order = "Unassigned",
Family = "Unassigned",
Genus = "Unassigned",
Species = "Unassigned"))
#Graph itself for top 10 OTUs
colors <- brewer.pal(12, "Paired")
ggplot(imp_merged, aes(x = reorder(Genus, MeanDecreaseAccuracy, decreasing = TRUE), y = MeanDecreaseAccuracy, fill = Family)) +
geom_bar(stat = "identity", color = "black") +
coord_flip() +
theme_classic() +
theme(legend.position = "top") +
scale_color_manual(values = colors) +
labs(x = "Top Taxa for classifying \n Plant vs Animal Protein",
y = "Mean Decrease Accuracy (%)",
fill = "")
lefse_results_file <- "lefse_results.rds"
if (!file.exists(lefse_results_file)) {
lefse <- run_lefse(
Protein_filt,
group = "Protein_source", # Metadata column for grouping
wilcoxon_cutoff = 0.01,
lda_cutoff = 3,
multigrp_strat = TRUE,
kw_cutoff = 0.01
)
saveRDS(lefse, lefse_results_file)
cat("LEfSe analysis completed and results saved.\n")
} else {
lefse <- readRDS(lefse_results_file)
cat("LEfSe results loaded.\n")
}
## LEfSe results loaded.
# Prepare marker data for plotting
marker_data <- data.frame(
feature = marker_table(lefse)$feature,
enrich_group = as.character(marker_table(lefse)$enrich_group),
ef_lda = marker_table(lefse)$ef_lda,
pvalue = marker_table(lefse)$pvalue
)
# Process marker data
marker_data <- marker_data %>%
mutate(
genus = sub(".*\\|", "", feature), # Extract genus name
LDA_Adjusted = ifelse(enrich_group == "Plant", ef_lda, -ef_lda) # Adjust LDA scores
) %>%
filter(LDA_Adjusted != 0) %>% # Exclude taxa with zero LDA score
arrange(enrich_group, LDA_Adjusted) # Sort by group and LDA score
# Create the bar plot
lefse_bar_plot <- ggplot(marker_data, aes(x = reorder(genus, LDA_Adjusted), y = LDA_Adjusted, fill = enrich_group)) +
geom_bar(stat = "identity", width = 0.8) +
geom_text(
data = marker_data %>% filter(enrich_group == "Animal"),
aes(label = genus, y = 0.1),
hjust = 0, size = 3, color = "black"
) +
geom_text(
data = marker_data %>% filter(enrich_group == "Plant"),
aes(label = genus, y = -0.1),
hjust = 1, size = 3, color = "black"
) +
coord_flip() +
theme_classic() +
theme(
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
plot.title = element_text(face = "bold", hjust = 0.5)
) +
labs(
y = "LDA score (log10)",
x = NULL
) +
scale_y_continuous(
limits = c(-5, 6),
breaks = seq(-5, 6, by = 1)
) +
scale_fill_manual(values = c("Plant" = "blue", "Animal" = "red"), name = "Group")
# Print the plot
print(lefse_bar_plot)
plot_cladogram(lefse, color = c("blue", "red"), clade_label_level = 4)
Metabolic_flux <- import_qiime_sample_data("META_flux1.tsv")
dim(Metabolic_flux)
## [1] 86 188
Metadf <- data.frame(Metabolic_flux)
Metadf <- Metadf[,-1]
dim(Metadf)
## [1] 86 187
#Make Training Dataset
predict <- t(Metadf)
dim(predict)
## [1] 187 86
# We have 116 samples representing 353 pathways
dim(sample_data(Protein_filt))
## [1] 187 26
res <- as.factor(sample_data(Protein_filt)$Protein_source)
res <- sample_data(Metadf)$Protein_source
dim(res)
## NULL
head(sample_data(Protein_filt)$Protein_source)
## [1] Plant Plant Plant Plant Plant Plant
## Levels: Animal Plant
res <- sample_data(Protein_filt)$Protein_source
res <- as.factor(res) # Convert to factor
# Create Response Variable and Dataset
rf <- data.frame(res, predict)
# Random Forest Model (OOB)
class <- rfsrc(res ~ ., data = rf, ntree = 1000,
nodesize = 1, save.memory = TRUE, seed = 8)
imp <- vimp(class, importance = "permute")$importance
# Turn into dataframe and rename the one column into Var Imp
impdf <- data.frame(imp)
colnames(impdf) <- c("all", "Animal", "Plant")
# Add the pathways into the data as its own column
impdf2 <- impdf %>% add_column(Metabolite = rownames(data.frame(Metadf)))
# Arrange the variable importance values
arrangedimp <- arrange(impdf2, desc(all))
# Take only the non-zero VarImp
impfeat <- arrangedimp[arrangedimp$all != 0, ]
dim(impfeat)
## [1] 66 4
# Filter top 20
top10impfeat <- arrange(impfeat, desc(all))[1:20, ]
# Graph top 10 metabolites
colors <- brewer.pal(10, "Paired")
ggplot(top10impfeat, aes(x = reorder(Metabolite, all, decreasing = TRUE), y = all, fill = Metabolite)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_classic() +
theme(legend.position = "top") +
scale_color_manual(values = colors) +
labs(x = "Top Metabolites for Classifying\n Diet Type",
y = "OOB Variable Importance",
fill = "")
# Validation Model
ind <- sample(2, nrow(rf), replace = TRUE, prob = c(0.7, 0.3))
train <- rf[ind == 1, ]
test <- rf[ind == 2, ]
# Generate Initial Validation Classification Model
set.seed(28)
typeclassval <- rfsrc(res ~ ., data = train, ntree = 1000, nodesize = 1,
save.memory = TRUE, seed = 28, forest = TRUE)
# Perform Validation
rfttpredprot <- predict.rfsrc(typeclassval, test, seed = 28)
# Variable Importance for Validation
impvalp2 <- vimp(typeclassval, importance = "permute")$importance
impdfvalp2 <- data.frame(impvalp2)
colnames(impdfvalp2)[which(names(impdfvalp2) == "all")] <- "VarImp"
impdf2valp2 <- impdfvalp2 %>% add_column(Metabolite = rownames(data.frame(Metadf)))
# Arrange the variable importance values
arrangedimpvalp2 <- dplyr::arrange(impdf2valp2, desc(VarImp))
impfeatvalp2 <- arrangedimpvalp2[arrangedimpvalp2$VarImp != 0, ]
dim(impfeatvalp2)
## [1] 65 4
# Graph the top 10 pathways
top10impfeatvalp2 <- impfeatvalp2[1:20, ]
colors <- brewer.pal(10, "Paired")
ggplot(top10impfeatvalp2, aes(x = reorder(Metabolite, VarImp, increasing = TRUE), y = VarImp, fill = Metabolite)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_classic() +
theme(legend.position = "right") +
scale_color_manual(values = colors) +
labs(x = "Metabolic flux for classifying Protein Source",
y = "Validation Variable Importance",
fill = "")
flux_metadata <- read.csv("META_flux.csv")
flux_metadata <- flux_metadata[!duplicated(flux_metadata[, 1]), ]
rownames(flux_metadata) <- flux_metadata[, 1]
flux_metadata <- flux_metadata[, -1] # Drop column used for row names
# Load taxa abundance
taxa_abundance <- read.csv("microbial_abundance.csv", row.names = 1)
# Load metadata for Protein source
sample_metadata <- data.frame(
Sample = colnames(flux_metadata),
Protein_source = c("Plant","Animal")
)
### Compute Correlation and P-values
cor.mtest <- function(mat1, mat2, method = "spearman") {
p_mat <- matrix(NA, nrow = ncol(mat1), ncol = ncol(mat2))
for (i in 1:ncol(mat1)) {
for (j in 1:ncol(mat2)) {
test <- cor.test(mat1[, i], mat2[, j], method = method)
p_mat[i, j] <- test$p.value
}
}
return(list(p = p_mat))
}
cor_matrix <- cor(t(taxa_abundance), flux_metadata, method = "spearman")
p_values <- cor.mtest(t(taxa_abundance), flux_metadata, method = "spearman")$p
### Filter Significant Taxa and Add Markers
significant_taxa <- apply(p_values, 1, function(row) any(row < 0.05))
cor_matrix_significant <- cor_matrix[significant_taxa, ]
p_values_significant <- p_values[significant_taxa, ]
significant_markers <- ifelse(p_values_significant < 0.05, "*", "")
### Top 50 Taxa Analysis
max_cor <- apply(abs(cor_matrix_significant), 1, max)
ranked_taxa <- names(sort(max_cor, decreasing = TRUE))
top_50_taxa <- ranked_taxa[1:50]
cor_matrix_top50 <- cor_matrix_significant[top_50_taxa, , drop = FALSE]
p_values_top50 <- p_values_significant[top_50_taxa, , drop = FALSE]
significant_markers_top50 <- ifelse(p_values_top50 < 0.05, "*", "")
# Calculate Total Metabolic Flux for Each Metabolite
total_flux <- rowSums(flux_metadata)
# Create DataFrame of Metabolites and Their Total Flux
metabolite_flux <- data.frame(
Metabolite = rownames(flux_metadata),
Total_Flux = total_flux
)
# Match metabolites to their Protein source based on sample association
metabolite_flux <- metabolite_flux %>%
mutate(Protein_source = apply(
flux_metadata[Metabolite, ], 1,
function(row) {
# Determine the dominant Protein source by summing fluxes per group
source_sums <- tapply(row, sample_metadata$Protein_source, sum)
names(which.max(source_sums)) # Assign the Protein source with the highest total flux
}
))
# Filter the correlation matrix and annotations for the matched metabolites
cor_matrix_top50 <- cor_matrix_top50[metabolite_flux$Metabolite, , drop = FALSE]
annotation_col <- data.frame(
Protein_source = metabolite_flux$Protein_source
)
rownames(annotation_col) <- metabolite_flux$Metabolite
# Define Annotation Colors
annotation_colors <- list(
Protein_source = c("Plant" = "green", "Animal" = "orange")
)
# Order Columns by Protein Source
ordered_cols <- order(annotation_col$Protein_source)
cor_matrix_top50 <- cor_matrix_top50[, ordered_cols, drop = FALSE]
annotation_col <- annotation_col[ordered_cols, , drop = FALSE]
### Assign Genus and Phylum Annotations Dynamically
taxonomy <- as.data.frame(tax_table(Protein_filt)) # Replace with your phyloseq object
taxonomy <- taxonomy %>%
tibble::rownames_to_column(var = "OTU") %>%
tidyr::replace_na(list(Genus = "Unknown", Phylum = "Unknown"))
otu_to_genus <- taxonomy$Genus
names(otu_to_genus) <- taxonomy$OTU
otu_to_phylum <- taxonomy$Phylum
names(otu_to_phylum) <- taxonomy$OTU
rownames(cor_matrix_top50) <- otu_to_genus[rownames(cor_matrix_top50)]
annotation_row <- data.frame(Phylum = otu_to_phylum[rownames(cor_matrix_top50)])
rownames(annotation_row) <- rownames(cor_matrix_top50)
# Define Phylum Colors
phylum_colors <- setNames(
rainbow(length(unique(annotation_row$Phylum))),
unique(annotation_row$Phylum)
)
annotation_colors <- c(
annotation_colors,
list(Phylum = phylum_colors)
)
### Generate Heatmap
pheatmap(
cor_matrix_top50,
color = colorRampPalette(c("blue", "white", "red"))(100),
display_numbers = significant_markers_top50,
cluster_rows = TRUE,
cluster_cols = FALSE,
main = "Heatmap: Top 50 Taxa vs Metabolite Flux (Spearman Correlation)",
annotation_col = annotation_col,
annotation_row = annotation_row,
annotation_colors = annotation_colors
)
# Load datasets
flux <- read.csv("META_exchange_fluxes_EXPORT.csv", header = TRUE)
df.flux <- read.csv("FINAL_Metadata.csv", header = TRUE)
df.reduced <- read.csv("META_reduced.csv", header = TRUE)
# Merge datasets
flux_df_merged <- merge(flux, df.flux, by = "SampleID")
flux.merged <- merge(flux_df_merged, df.reduced, by = "SampleID")
# Clean taxon names by removing "g__" prefix
flux.merged$taxon <- sub("^g__", "", flux.merged$taxon)
# Filter for "isobut_m", "isocapr_m", "isoval_m", "4hphac_m", "2hyoxplac_m" metabolites
Metabolite <- subset(flux.merged, metabolite %in% c("isobut_m", "isocapr_m", "isoval_m", "4hphac_m", "2hyoxplac_m"))
# Calculate maximum y-values for each metabolite
label_positions <- Metabolite %>%
group_by(metabolite) %>%
summarize(label_y = max(flux, na.rm = TRUE) * 1.05, .groups = "drop")
# Merge label positions into the dataset
Metabolite <- Metabolite %>%
left_join(label_positions, by = "metabolite")
# Plot metabolic flux for selected metabolites
ggplot(Metabolite, aes(x = Protein_source, y = flux, fill = Protein_source)) +
geom_bar_pattern(
stat = "summary",
fun = "sum",
position = "dodge",
pattern = "crosshatch", # Add crosshatch pattern for checked look
pattern_density = 0.3, # Adjust density of the pattern
pattern_color = "black" # Outline color of the pattern
) +
stat_compare_means(
method = "kruskal.test",
label = "p.format", # Display the actual p-value
size = 3, # Adjust text size
fontface = "bold", # Make the p-value bold
label.y = max(Metabolite$flux) *55.05 # Position the p-value slightly above the highest flux value
) +
facet_wrap(~ metabolite, scales = "free_y", labeller = labeller(.default = label_value)) + # Remove "metabolite:" prefix
theme_classic2() +
labs(
x = "Protein Source",
y = "Total Metabolic Flux [mmol/h]"
) +
theme(
axis.text.x = element_text(face = "bold"),
axis.text.y = element_text(face = "bold"),
axis.title.x = element_text(face = "bold"),
axis.title.y = element_text(face = "bold"),
legend.position = "", # Show legend
strip.text = element_text(face = "bold") # Make facet labels bold
) +
scale_fill_manual(
values = c("Plant" = "blue", "Animal" = "red"), # Fill color mapping
guide = guide_legend(override.aes = list(pattern = "crosshatch")) # Legend pattern
)
# Filter for "tma_m", "so3_m", "h2s_m" metabolites
Metabolite <- subset(flux.merged, metabolite %in% c("tma_m", "so3_m", "h2s_m"))
# Calculate maximum y-values for each metabolite
label_positions <- Metabolite %>%
group_by(metabolite) %>%
summarize(label_y = max(flux, na.rm = TRUE) * 1.05, .groups = "drop")
# Merge label positions into the dataset
Metabolite <- Metabolite %>%
left_join(label_positions, by = "metabolite")
# Plot metabolic flux for selected metabolites
ggplot(Metabolite, aes(x = Protein_source, y = flux, fill = Protein_source)) +
geom_bar_pattern(
stat = "summary",
fun = "sum",
position = "dodge",
pattern = "crosshatch", # Add crosshatch pattern for checked look
pattern_density = 0.3, # Adjust density of the pattern
pattern_color = "black" # Outline color of the pattern
) +
stat_compare_means(
method = "kruskal.test",
label = "p.format", # Display the actual p-value
size = 3, # Adjust text size
fontface = "bold", # Make the p-value bold
label.y = max(Metabolite$flux) *120.05 # Position the p-value slightly above the highest flux value
) +
facet_wrap(~ metabolite, scales = "free_y", labeller = labeller(.default = label_value)) + # Remove "metabolite:" prefix
theme_classic2() +
labs(
x = "Protein Source",
y = "Total Metabolic Flux [mmol/h]"
) +
theme(
axis.text.x = element_text(face = "bold"),
axis.text.y = element_text(face = "bold"),
axis.title.x = element_text(face = "bold"),
axis.title.y = element_text(face = "bold"),
legend.position = "", # Show legend
strip.text = element_text(face = "bold") # Make facet labels bold
) +
scale_fill_manual(
values = c("Plant" = "blue", "Animal" = "red"), # Fill color mapping
guide = guide_legend(override.aes = list(pattern = "crosshatch")) # Legend pattern
)
# Filter for "hdca_m", "ocdca_m" metabolites
Metabolite <- subset(flux.merged, metabolite %in% c("hdca_m", "ocdca_m"))
# Calculate maximum y-values for each metabolite
label_positions <- Metabolite %>%
group_by(metabolite) %>%
summarize(label_y = max(flux, na.rm = TRUE) * 1.05, .groups = "drop")
# Merge label positions into the dataset
Metabolite <- Metabolite %>%
left_join(label_positions, by = "metabolite")
# Plot metabolic flux for selected metabolites
ggplot(Metabolite, aes(x = Protein_source, y = flux, fill = Protein_source)) +
geom_bar_pattern(
stat = "summary",
fun = "sum",
position = "dodge",
pattern = "crosshatch", # Add crosshatch pattern for checked look
pattern_density = 0.3, # Adjust density of the pattern
pattern_color = "black" # Outline color of the pattern
) +
stat_compare_means(
method = "kruskal.test",
label = "p.format", # Display the actual p-value
size = 3, # Adjust text size
fontface = "bold", # Make the p-value bold
label.y = max(Metabolite$flux) *40 # Position the p-value slightly above the highest flux value
) +
facet_wrap(~ metabolite, scales = "free_y", labeller = labeller(.default = label_value)) + # Remove "metabolite:" prefix
theme_classic2() +
labs(
x = "Protein Source",
y = "Total Metabolic Flux [mmol/h]"
) +
theme(
axis.text.x = element_text(face = "bold"),
axis.text.y = element_text(face = "bold"),
axis.title.x = element_text(face = "bold"),
axis.title.y = element_text(face = "bold"),
legend.position = "", # Show legend
strip.text = element_text(face = "bold") # Make facet labels bold
) +
scale_fill_manual(
values = c("Plant" = "blue", "Animal" = "red"), # Fill color mapping
guide = guide_legend(override.aes = list(pattern = "crosshatch")) # Legend pattern
)
flux_metadata <- read.csv("BCFA_ABUNDANCE1.csv")
flux_metadata <- flux_metadata[!duplicated(flux_metadata[, 1]), ]
rownames(flux_metadata) <- flux_metadata[, 1]
flux_metadata <- flux_metadata[, -1] # Drop column used for row names
# Load taxa abundance
taxa_abundance <- read.csv("microbial_abundance.csv", row.names = 1)
# Load metadata for Protein source
sample_metadata <- data.frame(
Sample = colnames(flux_metadata),
Protein_source = c("Plant","Animal")
)
### Compute Correlation and P-values
cor.mtest <- function(mat1, mat2, method = "spearman") {
p_mat <- matrix(NA, nrow = ncol(mat1), ncol = ncol(mat2))
for (i in 1:ncol(mat1)) {
for (j in 1:ncol(mat2)) {
test <- cor.test(mat1[, i], mat2[, j], method = method)
p_mat[i, j] <- test$p.value
}
}
return(list(p = p_mat))
}
cor_matrix <- cor(t(taxa_abundance), flux_metadata, method = "spearman")
p_values <- cor.mtest(t(taxa_abundance), flux_metadata, method = "spearman")$p
### Filter Significant Taxa and Add Markers
significant_taxa <- apply(p_values, 1, function(row) any(row < 0.05))
cor_matrix_significant <- cor_matrix[significant_taxa, ]
p_values_significant <- p_values[significant_taxa, ]
significant_markers <- ifelse(p_values_significant < 0.05, "*", "")
### Top 50 Taxa Analysis
max_cor <- apply(abs(cor_matrix_significant), 1, max)
ranked_taxa <- names(sort(max_cor, decreasing = TRUE))
top_50_taxa <- ranked_taxa[1:50]
cor_matrix_top50 <- cor_matrix_significant[top_50_taxa, , drop = FALSE]
p_values_top50 <- p_values_significant[top_50_taxa, , drop = FALSE]
significant_markers_top50 <- ifelse(p_values_top50 < 0.05, "*", "")
# Calculate Total Metabolic Flux for Each Metabolite
total_flux <- rowSums(flux_metadata)
# Create DataFrame of Metabolites and Their Total Flux
metabolite_flux <- data.frame(
Metabolite = rownames(flux_metadata),
Total_Flux = total_flux
)
# Match metabolites to their Protein source based on sample association
metabolite_flux <- metabolite_flux %>%
mutate(Protein_source = apply(
flux_metadata[Metabolite, ], 1,
function(row) {
# Determine the dominant Protein source by summing fluxes per group
source_sums <- tapply(row, sample_metadata$Protein_source, sum)
names(which.max(source_sums)) # Assign the Protein source with the highest total flux
}
))
# Filter the correlation matrix and annotations for the matched metabolites
cor_matrix_top50 <- cor_matrix_top50[metabolite_flux$Metabolite, , drop = FALSE]
annotation_col <- data.frame(
Protein_source = metabolite_flux$Protein_source
)
rownames(annotation_col) <- metabolite_flux$Metabolite
# Define Annotation Colors
annotation_colors <- list(
Protein_source = c("Plant" = "green", "Animal" = "orange")
)
# Order Columns by Protein Source
ordered_cols <- order(annotation_col$Protein_source)
cor_matrix_top50 <- cor_matrix_top50[, ordered_cols, drop = FALSE]
annotation_col <- annotation_col[ordered_cols, , drop = FALSE]
### Assign Genus and Phylum Annotations Dynamically
taxonomy <- as.data.frame(tax_table(Protein_filt)) # Replace with your phyloseq object
taxonomy <- taxonomy %>%
tibble::rownames_to_column(var = "OTU") %>%
tidyr::replace_na(list(Genus = "Unknown", Phylum = "Unknown"))
otu_to_genus <- taxonomy$Genus
names(otu_to_genus) <- taxonomy$OTU
otu_to_phylum <- taxonomy$Phylum
names(otu_to_phylum) <- taxonomy$OTU
rownames(cor_matrix_top50) <- otu_to_genus[rownames(cor_matrix_top50)]
annotation_row <- data.frame(Phylum = otu_to_phylum[rownames(cor_matrix_top50)])
rownames(annotation_row) <- rownames(cor_matrix_top50)
# Define Phylum Colors
phylum_colors <- setNames(
rainbow(length(unique(annotation_row$Phylum))),
unique(annotation_row$Phylum)
)
annotation_colors <- c(
annotation_colors,
list(Phylum = phylum_colors)
)
### Generate Heatmap
pheatmap(
cor_matrix_top50,
color = colorRampPalette(c("blue", "white", "red"))(100),
display_numbers = significant_markers_top50,
cluster_rows = TRUE,
cluster_cols = FALSE,
main = "Heatmap: Top 50 Taxa vs Metabolite Flux (Spearman Correlation)",
annotation_col = annotation_col,
annotation_row = annotation_row,
annotation_colors = annotation_colors
)
Protein_filt@tax_table %>%
data.frame() %>%
group_by(Phylum) %>%
summarise(n = length(Genus))
## # A tibble: 36 × 2
## Phylum n
## <chr> <int>
## 1 Acidobacteriota 183
## 2 Actinobacteriota 757
## 3 Armatimonadota 14
## 4 Bacteroidota 2982
## 5 Bdellovibrionota 21
## 6 Caldisericota 3
## 7 Calditrichota 2
## 8 Campylobacterota 44
## 9 Chloroflexi 236
## 10 Cloacimonadota 2
## # ℹ 26 more rows
bac_Protein_filt <- subset_taxa(Protein_filt, Kingdom == "d__Bacteria")
bac_Protein_df <- psmelt(bac_Protein_filt)
#Summarize taxa#
number_of_tax <- bac_Protein_df %>%
dplyr::filter(Abundance > 5) %>%
group_by(Protein_source == "Plant", Phylum)
number_of_tax2 <- bac_Protein_df %>%
dplyr::filter(Abundance > 5) %>%
group_by(Protein_source == "Animal", Phylum)
#To place the sets of genera in a list, we use
venn_data <- list(Plant = c(number_of_tax$Phylum[number_of_tax$`Protein_source == "Plant"`]),
Animal = c(number_of_tax2$Phylum[number_of_tax2$`Protein_source == "Animal"`]))
# Plot the Venn diagram and specify colors for each set
ggvenn(
venn_data,
fill_color = c("blue", "red"),
stroke_size = 0.5, set_name_size = 4
)
# Step 1: Filter and clean the phyloseq object
Protein_filt <- Protein_filt %>%
tax_fix(unknowns = c("uncultured", "unclassified", "unknown", "NA")) %>% # Replace unknown values
tax_filter(min_prevalence = 1) %>% # Retain taxa with at least 1 prevalence
phyloseq_validate() # Validate the phyloseq object
print(Protein_filt)
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 10123 taxa and 187 samples ]
## sample_data() Sample Data: [ 187 samples by 26 sample variables ]
## tax_table() Taxonomy Table: [ 10123 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 10123 tips and 10061 internal nodes ]
# Step 2: Check and handle NAs in the Genus rank
if (any(is.na(tax_table(Protein_filt)[, "Genus"]))) {
cat("NAs found in Genus rank. Replacing with 'Unknown'.\n")
tax_table(Protein_filt)[, "Genus"] <- ifelse(
is.na(tax_table(Protein_filt)[, "Genus"]),
"Unknown",
tax_table(Protein_filt)[, "Genus"]
)
}
# Step 3: Create the bar plot
p1 <- Protein_filt %>%
ps_select(Protein_source, Protein_level) %>% # Select relevant metadata columns
phyloseq::merge_samples(group = "Protein_source") %>% # Merge samples by "Protein_source"
comp_barplot(
tax_level = "Phylum", # Aggregation level for plotting
n_taxa = 10, # Number of taxa to display
bar_width = 0.8, # Adjust bar width
position = position_dodge(width = 0.6), # Adjust spacing between bars
transform = function(x) x * 100 # Convert relative abundance to percentages
) +
labs(
x = "Protein Source", # X-axis label
y = "Relative Abundance (%)" # Y-axis label indicating percentage
) +
theme_classic() + # Apply a clean theme
theme(
axis.text.x = element_text(face = "bold", size = 12), # Customize x-axis text
axis.text.y = element_text(face = "bold", size = 12), # Customize y-axis text
axis.title.x = element_text(face = "bold", size = 14), # Bold x-axis title
axis.title.y = element_text(face = "bold", size = 14), # Bold y-axis title
legend.text = element_text(size = 10), # Adjust legend text size
legend.title = element_text(face = "bold") # Bold legend title
)
# Step 4: Ensure proper ordering of Protein_source levels
if ("Protein_source" %in% colnames(p1$data)) {
p1$data$Protein_source <- factor(p1$data$Protein_source, levels = c("Animal", "Plant"))
} else {
stop("Error: 'Protein_source' column is missing in the plot data.")
}
# Step 5: Display the plot
print(p1)
# Split data into training and test sets
set.seed(28)
trainIndex <- createDataPartition(typeres, p = 0.7, list = FALSE)
train_set <- rfprot[trainIndex, ]
test_set <- rfprot[-trainIndex, ]
# Number of samples in your test set
num_test_samples <- nrow(test_set)
# Generate random labels (1 or 2) for the test data
set.seed(42) # Set seed for reproducibility
random_labels <- sample(c(1, 2), num_test_samples, replace = TRUE)
# Add these random labels to the test set
test_set$random_labels <- random_labels
# Make predictions on the test set
random_predictions <- predict(typeclassval, newdata = test_set)$class
# Create confusion matrix
conf_matrix <- table(Predicted = random_predictions, Actual = test_set$random_labels)
# Convert confusion table to a data frame for further analysis or visualization
conf_df <- as.data.frame(conf_matrix)
# Calculate accuracy
total_predictions <- sum(conf_matrix)
conf_df$Accuracy <- (conf_df$Freq / total_predictions) * 100
# Create a label for each cell with count and accuracy percentage
conf_df$Label <- paste0(conf_df$Freq, " (", round(conf_df$Accuracy, 1), "%)")
# View the confusion matrix with labels
print(conf_df)
# Calculate accuracy
accuracy_random <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Accuracy on random test data:", round(accuracy_random * 100, 2), "%\n")
# Convert confusion table to a data frame for visualization
conf_df <- as.data.frame(conf_matrix)
# Create a label for each cell with count and accuracy percentage
conf_df$Accuracy <- (conf_df$Freq / sum(conf_matrix)) * 100
conf_df$Label <- paste0(conf_df$Freq, " (", round(conf_df$Accuracy, 1), "%)")
conf_matrix_plot <- ggplot(conf_df, aes(x = Predicted, y = Actual, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Label), color = "black", size = 7) +
scale_fill_gradient(low = "white", high = "blue") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Randomized data Confusion Matrix", x = "Predicted", y = "Actual") +
theme(legend.position = "none")
conf_matrix_plot
top_genera <- unique(imp_merged$Genus) # Assuming the top 10 are already selected
Protein_genera <- Protein_filt %>%
tax_glom(taxrank = "Genus") %>% # agglomerate at phylum level
transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
psmelt() %>% # Melt to long format
dplyr::arrange(Phylum) # Sort data frame alphabetically by phylum
# Create a filtered version of Protein_genera that reflects what you aggregated.
Protein_genera_filtered <- Protein_genera %>%
filter(Genus %in% top_genera)
# Perform the Wilcoxon rank-sum test for each genus
Wil_Plant_vs_Animal <- lapply(top_genera, function(genus) {
Plant_abundance <- Protein_genera_filtered$Abundance[Protein_genera_filtered$Genus == genus & Protein_genera_filtered$Protein_source == "Plant"]
Animal_abundance <- Protein_genera_filtered$Abundance[Protein_genera_filtered$Genus == genus & Protein_genera_filtered$Protein_source == "Animal"]})
# Create the box plot
boxplot_data <- Protein_genera_filtered %>%
filter(Protein_source %in% c("Plant", "Animal")) %>%
ggplot(aes(x = Protein_source, y = Abundance, fill = Protein_source)) +
geom_boxplot() +
scale_y_log10() +
facet_wrap(~Genus, scales = "free_y", nrow = 2) +
stat_compare_means(comparisons = list(c("Plant", "Animal")), label = 'p.signif') +
labs(x = "", y = "Log10 Abundance", fill = "Protein Source") +
theme_classic2() +
scale_fill_manual(values = c("Plant" = "blue", "Animal" = "red")) + # Set custom colors
theme(
strip.text = element_text(face = "bold", size = 10), # Make facet labels bold
axis.text.x = element_blank(), # Bold x-axis label
axis.ticks.x = element_blank(),
axis.title.y = element_text(face = "bold", size = 14) # Bold y-axis label
)
# Print the box plot
print(boxplot_data)